/**********************************************************************/
/*
   Title: clean_agents_village.do
   Author: Robbie Dulin, CLotaire Boyer
   Created: 30 March 2020
   Description: Cleans agent data and creates village-level datasets
     of agent variables for March 2018, August 2018, and March 2019.
     Outputs cleaned files to be used for agent experiment dummy tables.
*/
/**********************************************************************/

// NOTE: Bank requested full confidentiality. Therefore, this portion 
// of the code is heavily redacted, and no log file is provided publically.

/*----------------------------------------------------*/
                  /* Section 0: Setup */
/*----------------------------------------------------*/

clear
set more off

// ** mount "J-PAL Raskin Transition Questionnaires & Data/Agent ratio experiment/Agent Data"
// ** Then set the path "agent_data" to the location of the mounted volume
global agent_data                   "/Volumes/MOUNT1"
global agent_aug_2020                  "/Volumes/MOUNT1"
// ** Then mount "J-PAL Raskin Transition Questionnaires & Data/Agent ratio experiment/Agent Data March 2018"
// // and then set the path "agents_mar_18" to the location of the mounted volume
global agents_mar_18                "/Volumes/MOUNT2"
global randomization                "$dropboxbase/04_ResearchDesign/02 Randomization/1803 Competition Randomization v2"
global impact_eval                  "$dropboxbase/04_ResearchDesign/02 Randomization/1705 Kab Randomization v6 - 8.2 milion, treat most java - FINAL"

cap log close
local prefix: display %tdCYND td(`c(current_date)')
log using "$log/`prefix'_clean_agents_village", replace text


/*----------------------------------------------------*/
          /* Section 1: Clean Mar 2018 Data */
/*----------------------------------------------------*/

*** Load Bank Agent data bank by bank
** Load Mar 18 BANK1 data
  u "$agents_mar_18/baseline_BANK1_agent_march2018.dta", clear
  di _N
  isid KODEAGEN

* Collapse sum of agents per village
  gen agent = 1
  gen agents_BANK1 = 1
  collapse (sum) num_agents = agent agents_BANK1, by(KodeProvinsi NamaProvinsi KodeKabKota NamaKabupatenKota KodeKec NamaKecamatan KodeDesaKel NamaDesaKel kodewilayah)
  //summ
  tempfile mar18_BANK1
  save `mar18_BANK1'

** Load Mar 18 BANK2 data
  u "$agents_mar_18/baseline_BANK2_agent_march2018.dta", clear
  //tab  BANK
  isid TID
  di _N

* Collapse sum of agents per village
  gen agent = 1
  gen agents_BANK2 = 1
  collapse (sum) num_agents = agent agents_BANK2, by(KodeProvinsi NamaProvinsi KodeKabKota NamaKabupatenKota KodeKec NamaKecamatan KodeDesaKel NamaDesaKel kodewilayah)
  //summ
  tempfile mar18_BANK2
  save `mar18_BANK2'

** Load Mar 18 BANK3 data
  u "$agents_mar_18/baseline_BANK3_agent_march2018.dta", clear
  //tab  BANK3
  keep if BANK3 == 1
  di _N
  isid KODEAGEN

* Collapse sum of agents per village
  gen agent = 1
  gen agents_BANK3 = 1
  collapse (sum) num_agents = agent agents_BANK3, by(KodeProvinsi NamaProvinsi KodeKabKota NamaKabupatenKota KodeKec NamaKecamatan KodeDesaKel NamaDesaKel kodewilayah)
  //summ
  tempfile mar18_BANK3
  save `mar18_BANK3'

** Load matched agents that were previously unmatched
  u "$agents_mar_18/matched_original_unmatched_agents_BANK2_BANK3.dta", clear
  gen agent = 1
  gen agents_BANK2 = bank == "BANK2"
  gen agents_BANK3 = bank == "BANK3"
  gen unmatched_agents_mar18 = 1
  tab agents_BANK2 bank
  tab agents_BANK3 bank
  rename value_prov KodeProvinsi
  rename value_kab  KodeKabKota
  rename value_kec  KodeKec
  collapse (sum) num_agents = agent agents_BANK3 agents_BANK2 unmatched_agents_mar18, by(KodeProvinsi NamaProvinsi KodeKabKota NamaKabupatenKota KodeKec NamaKecamatan kodewilayah)
  tostring kodewilayah, replace
  //summ
  tempfile matched_agents
  save `matched_agents'

** Append bank datasets together
  u `mar18_BANK1', clear
  di _N
  append using `mar18_BANK2'
  di _N
  append using `mar18_BANK3'
  di _N
  gen unmatched_agents_mar18 = .
  append using `matched_agents', force
  di _N
  //summ
  duplicates report kodewilayah
  drop if kodewilayah == ""
  //summ


  foreach bank in BANK1 BANK2 BANK3 {
    egen num_agents_`bank' = total(agents_`bank')
    egen num_kabs_`bank' = group(KodeProvinsi KodeKabKota) if agents_`bank' > 0 & agents_`bank' != .
    //summ num_agents_`bank'
    //summ num_kabs_`bank'
  }
  di _N


* Collapse agent counts to the village level
  collapse (sum) num_agents_mar18 = num_agents agents_BANK1 agents_BANK2 agents_BANK3 unmatched_agents_mar18, by(kodewilayah)
  //summ

  assert num_agents_mar18 == agents_BANK1 + agents_BANK2 + agents_BANK3

  tempfile agents_village_mar18
  save `agents_village_mar18'

  local num = _N
  di "`num' villages are in March 2018 data"
  // REDACTED
  collapse (sum) *agent*
  //summ


* save data merged with village datasets
use "$importable/kpm2017_clean.dta",clear
  merge 1:1 kodewilayah using `agents_village_mar18'
  drop _m

save "$cleaned/agent_allindonesia_mar18.dta",replace

  // keep all villages here, drop non-experimental villages below

** Merge in bank assignments
* load district data set
  preserve
  u "$agent_data/Supplementary data/20200214_groupAB_district_corrected.dta", clear
  rename provinsi namaprovinsi
  rename kotakabupaten namakabupatenkota
  keep namaprovinsi namakabupatenkota idkab BANK4
  tempfile bank
  save `bank'
  restore

* merge
  rename idkabu idkab
  merge m:1 idkab using `bank'
  assert _m != 2

  // // keep only experimental villages
  // keep if _m == 3
  drop _m
  // REDACTED

** Merge in assignments and strata
* Create kota dummy to facilitate merge
  gen kota1 = substr(Kode_Kabupaten, 1, 1) == "7"

* load treatment assignments
  preserve
  u "$randomization/comp_treatments.dta", clear
  rename province Provinsi
  rename kotakabupaten Kabupaten
  keep Provinsi Kabupaten stratum kota1 comp_treat

  // rename kabus to match names
  replace Kabupaten = "KOTA PAREPARE" if Kabupaten == "KOTA PARE PARE"
  replace Kabupaten = "LABUHAN BATU" if Kabupaten == "LABUHANBATU"
  replace Kabupaten = "LABUHAN BATU SELATAN" if Kabupaten == "LABUHANBATU SELATAN"

  tempfile strata
  save `strata'
  restore

* merge
  merge m:1 Provinsi Kabupaten kota1 using `strata'
  assert _m != 2
  // assert 1:1 mapping between district names and codes
  bysort Provinsi Kabupaten (idkab): assert idkab[1] == idkab[_N]

  di _N
  save "$cleaned/mar18_village_treats.dta", replace

/*----------------------------------------------------*/
          /* Section 2: Clean Mar 2019 Data */
/*----------------------------------------------------*/

* raw agents data
  import excel "$agent_data/2019/Mar 2019/Archive/Data Agen BANK2.xlsx", clear firstrow
  di _N
  ds
  duplicates report KODEAGEN
  duplicates report TID_NAME ALAMAT_TID KODEWILAYAH

  import excel "$agent_data/2019/Mar 2019/Archive/Data Agen BANK1.xlsx", clear firstrow
  ds
  duplicates report KODEAGEN
  duplicates report KODEAGEN BULAN
  duplicates report NAMAAGEN ALAMAT KODEWILAYAH

* Load Mar 19 agent dataset
  u "$agent_data/2019/Mar 2019/dataset/all_agent_mar2019.dta", clear
  //tab  bank
  sort kodewilayah
  di _N
  //tab  bank
  gen agent = 1
  //tab  bpnt
  //tab  bank bpnt
  count if kodewilayah == ""
  gen KABU = substr(kodewilayah, 1, 4)
  foreach bank in BANK1 BANK2 BANK3 {
    egen num_kabs_`bank' = group(KABU) if bank == "`bank'"
    //summ num_kabs_`bank'
  }



* Collapse agents to village-level
  collapse (sum) agents_total = agent num_agents_bpnt = bpnt , by(kodewilayah)
  //summ agents_total num_agents_bpnt
  drop if kodewilayah == ""
  tempfile mar19_agents
  save `mar19_agents'

* merge into village level dataset
  u "$agent_data/Supplementary data/kpm2017_clean.dta", clear
  merge 1:1 kodewilayah using `mar19_agents'
  drop _m
  // keep all for now and drop outside of A/B villages later

  * make agents in village 0 if not in agent dataset
  replace agents_total = 0  if agents_total == .
  replace num_agents_bpnt = 0 if num_agents_bpnt == .

  //summ agents_total num_agents_bpnt
  rename TotalPenerima beneficiaries
  destring beneficiaries, replace
  rename idkabu idkab

** Merge in bank assignments
* load district data set
  preserve
  u "$agent_data/Supplementary data/20200214_groupAB_district_corrected.dta", clear
  rename provinsi namaprovinsi
  rename kotakabupaten namakabupatenkota
  keep namaprovinsi namakabupatenkota idkab BANK4
  tempfile bank
  save `bank'
  restore

* merge in A/B districts and bank assignments
  merge m:1 idkab using `bank'
  assert _m != 2
  drop _m

** Merge in assignments and strata
* Create kota dummy to facilitate merge
  gen kota1 = substr(Kode_Kabupaten, 1, 1) == "7"
  //tab  Kode_Kabupaten kota1

* load treatment assignments
  preserve
  u "$randomization/comp_treatments.dta", clear
  rename province Provinsi
  rename kotakabupaten Kabupaten
  keep Provinsi Kabupaten stratum kota1 comp_treat
  // rename kabus to match names
  replace Kabupaten = "KOTA PAREPARE" if Kabupaten == "KOTA PARE PARE"
  replace Kabupaten = "LABUHAN BATU" if Kabupaten == "LABUHANBATU"
  replace Kabupaten = "LABUHAN BATU SELATAN" if Kabupaten == "LABUHANBATU SELATAN"
  tempfile strata
  save `strata'
  restore

* merge in treatment assignments
  merge m:1 Provinsi Kabupaten kota1 using `strata'
  assert _m != 2
  rename _m AB_match
  bysort Provinsi Kabupaten (idkab): assert idkab[1] == idkab[_N]

* merge in impact eval treatment assignments
  preserve
  u "$impact_eval/treatments.dta", clear
  sort namaprovinsi namakabupaten
  keep namaprovinsi namakabupaten treated finalstratum
  rename namaprovinsi Provinsi
  rename namakabupaten Kabupaten
  keep if treated != .
  // Need to create kota/kab variable to distinguish bewteen districts with same name in same province
  /* I identify which districts are experimental using ⁨05_Intervention⁩/01 Impact Evaluation⁩/20190523_Kab Kota Phase Expansion.xlsx,
  		which distinguishes between kota and kab for same-name districts. If a district is experimental in the above sheet, then I
  		assume that it corresponds to the district with the same name in the randomization file. I then assign kab status to these
  		districts to match the phase sheet. This distinguishes between districts in the same province with the same name. */
  gen kab = 1 if inlist(Kabupaten, "BIMA", "GORONTALO", "PEKALONGAN", "SERANG")

  // rename Kabupaten Pontianak to Mempawah (name change in 2014)
  // Note: Kota Pontianak is A/B district, kabupaten Mempawah/Pontianak is IE district
  replace Kabupaten = "MEMPAWAH" if Kabupaten == "PONTIANAK"
  tempfile IE_treats
  save `IE_treats'
  restore

  // abbreviate province names to match treatment assignment file
  replace Provinsi = "KEP. BANGKA BELITUNG" if Provinsi == "KEPULAUAN BANGKA BELITUNG"
  replace Provinsi = "D I YOGYAKARTA" if Provinsi == "DI YOGYAKARTA"
  replace Provinsi = "NTB" if Provinsi == "NUSA TENGGARA BARAT"

  // create kab variable to match
  gen kab = 1 if inlist(idkab, "5206", "7502", "3326", "3604")

  // merge treatment assignments
  merge m:1 Provinsi Kabupaten kab using "`IE_treats'", gen(IE_match)
  assert IE_match != 2

  // keep observations matched from either the impact eval or A/B districts
  count if AB_match == 3 | IE_match == 3
  keep if AB_match == 3 | IE_match == 3
  di _N
  //tab  treated
  //tab  finalstratum



*** Create main outcome variables
* Variable: ratio of BPNT agents to beneficiaries
  count if beneficiaries == 0
  gen agent_ratio_bpnt = num_agents_bpnt / beneficiaries
  //summ agent_ratio_bpnt
  //summ beneficiaries
  //summ num_agents_bpnt agents_total

* ratio of total agents to beneficiaries
  gen agent_ratio_total = agents_total / beneficiaries
  //summ agent_ratio_total

* Variable: village has at least 1 BPNT agent for every 250 beneficiaries
  gen ratio250_bpnt = (agent_ratio_bpnt) >= (1 / 250) if !missing(agent_ratio_bpnt)
  //tab  ratio250_bpnt

* village has at least 1 agent per 250 beneficiaries
  gen ratio250_total = (agent_ratio_total) >= (1 / 250) if !missing(agent_ratio_total)
  //tab  ratio250_total

* Variable: Dummy for village having at least 2 BPNT agents
  gen at_least_2_bpnt = num_agents_bpnt >= 2 if !missing(num_agents_bpnt)
  //tab  at_least_2_bpnt
  // bysort at_least_2_bpnt: summ num_agents_bpnt

* village has at least 2 agents
  gen at_least_2_total = agents_total >= 2 if !missing(agents_total)
  //tab  at_least_2_total

* Variable: Village fulfills 1:250 ratio (bpnt agents) and has at least 2 bpnt agents
  gen both_criteria_bpnt = ratio250_bpnt == 1 & at_least_2_bpnt == 1 if !missing(ratio250_bpnt) & !missing(at_least_2_bpnt)
  //tab  ratio250_bpnt at_least_2_bpnt
  //tab  both_criteria_bpnt

* village fulfills both requirements (total agents)
  gen both_criteria_total = ratio250_total == 1 & at_least_2_total == 1 if !missing(ratio250_total) & !missing(at_least_2_total)
  //tab  both_criteria_total

* Create treatment and stratum variables
  //tab  comp_treat
  gen groupA = comp_treat == 0
  //tab  groupA comp_treat
  //tab  stratum

* Verify that all 216 districts are present
  egen num_KABU_AB = group(Provinsi Kabupaten) if AB_match == 3
  qui summ num_KABU_AB
  assert `r(max)' == 216

  egen num_KABU_IE = group(Provinsi Kabupaten) if IE_match == 3
  qui summ num_KABU_IE
  assert `r(max)' == 105
  drop num_KABU*

** Merge in Mar 18 variables
  merge 1:1 kodewilayah using `agents_village_mar18'
  drop if _m == 2
  //tab  _m
  //tab  AB_match _m
  //tab  IE_match _m

// If village doesn't match, I assume it is because there was no agent in that village
// i.e., I assume the 2018 agent data is exhaustive, and non-matches can safely assumed to be 0s
  foreach var of varlist agents_BANK1 agents_BANK2 agents_BANK3 num_agents_mar18 {
    replace `var' = 0 if _m == 1
    // summ `var'
  }
  drop _m
  rename agents_BANK1       agents_BANK1_mar18
  rename agents_BANK2       agents_BANK2_mar18
  rename agents_BANK3   agents_BANK3_mar18

*** Create Mar 18 variables
* Variable: agent ratio in March 2018
  gen agent_ratio_mar18 = num_agents_mar18 / beneficiaries
  //summ agent_ratio_bpnt agent_ratio_mar18

* Variable: meet 1:250 ratio in March 2018
  gen ratio250_mar18 = agent_ratio_mar18 >= (1/250) if !missing(agent_ratio_mar18)
  //summ ratio250_*

* Variable: meet at least 2 agents criteria
  gen at_least_2_mar18 = num_agents_mar18 >= 2 if !missing(num_agents_mar18)
  //summ at_least_2*

* Variable: Meet both criteria
  gen both_criteria_mar18 = ratio250_mar18 == 1 & at_least_2_mar18 == 1 if !missing(ratio250_mar18) & !missing(at_least_2_mar18)
  //summ both_criteria*

* Bank dummies
  //tab  BANK4
  gen BANK3 = BANK4 == "BANK3"
  gen BANK1 = BANK4 == "BANK1" | BANK4 == "REDACTED"
  gen BANK2 = BANK4 == "BANK2"


* Variable: Number of agents belonging to the bank that was chosen to administer BPNT in the district
  // NOTE: REDACTED agents were apparently handled by BANK1
  //summ agents*mar18
  gen agents_assigned_bank = agents_BANK1_mar18 if BANK4 == "BANK1" | BANK4 == "REDACTED"
  replace agents_assigned_bank = agents_BANK2_mar18 if BANK4 == "BANK2"
  replace agents_assigned_bank = agents_BANK3_mar18 if BANK4 == "BANK3"
  //summ agents_assigned_bank

* Variable: number of baseline agents from other banks (not chosen)
  gen agents_other_banks = num_agents_mar18 - agents_assigned_bank
  //summ agents_other_banks

* keep needed variables
keep kodewilayah Kode_Provinsi Provinsi Kode_Kabupaten Kabupaten Kode_Kecamatan Kecamatan Kode_Desa NamaDesaKel ///
  idkab beneficiaries agent_ratio* both_criteria* ratio250* at_least_2* comp_treat agents_total ///
  num_agents_bpnt *mar18 stratum BANK4 agents_assigned_bank agents_other_banks BANK3 BANK1 BANK2 groupA ///
  AB_match IE_match treated finalstratum

* Create winsorized number of bpnt agents variable
  //summ num_agents_bpnt , d
  //tab  num_agents_bpnt
  gen num_agents_winsor_bpnt = num_agents_bpnt
  _pctile num_agents_bpnt, p(0.5 99.5)
  replace num_agents_winsor_bpnt = `r(r1)' if num_agents_bpnt < `r(r1)'
  replace num_agents_winsor_bpnt = `r(r2)' if num_agents_bpnt > `r(r2)' & num_agents_bpnt != .
  //summ num_agents_bpnt num_agents_winsor_bpnt

* Create winsorized agent ratio
  //summ agent_ratio_bpnt, d
  gen agent_ratio_winsor_bpnt = agent_ratio_bpnt
  _pctile agent_ratio_bpnt, p(0.5 99.5)
  replace agent_ratio_winsor_bpnt = `r(r1)' if agent_ratio_bpnt < `r(r1)'
  replace agent_ratio_winsor_bpnt = `r(r2)' if agent_ratio_bpnt > `r(r2)' & agent_ratio_bpnt != .
  //summ agent_ratio_bpnt agent_ratio_winsor_bpnt

* prepare to save
  rename *winsor* *w*
  rename agents_total num_agents_total

  la var num_agents_total         "\# Agents In Village"
  la var num_agents_mar18         "\# Agents In Village (Baseline)"
  la var num_agents_bpnt          "\# BPNT Agents In Village"
  la var num_agents_w_bpnt        "\# BPNT Agents In Village (Winsorized)"
  la var agent_ratio_bpnt         "Agents Per Beneficiary"
  la var agent_ratio_w_bpnt       "Agents Per Beneficiary (Winsorized)"
  la var agent_ratio_mar18        "Agents Per Beneficiary (Baseline)"
  la var both_criteria_bpnt       "Both Criteria"
  la var both_criteria_mar18      "Both Criteria (Baseline)"
  la var ratio250_bpnt            "Meet 1/250 Rule"
  la var ratio250_mar18           "Meet 1/250 Rule (Baseline)"
  la var at_least_2_bpnt          "Meet 2 Per Village Rule"
  la var at_least_2_mar18         "Meet 2 Per Village Rule (Baseline)"
  la var beneficiaries            "\# Beneficiaries in Village"
  la var agents_assigned_bank     "Baseline Agents From Chosen Bank"
  la var agents_other_banks       "Baseline Agents From Other Banks"
  la var BANK3                  "BANK3 Chosen"
  la var BANK1                      "BANK1 Chosen"
  la var BANK2                      "BANK2 Chosen"
  la var comp_treat               "Group B"
  la var groupA                   "Group A"
  quietly ds
  di "`r(varlist)'"

  //summ *total
  //summ *bpnt

* Save A/B dataset
  preserve
  keep if AB_match == 3
  di _N
  drop treated finalstratum IE_match AB_match
  save "$cleaned/mar19_village_treats.dta", replace
  restore

* Save impact eval dataset
  preserve
  //tab  IE_match
  keep if IE_match == 3
  //tab  treated
  di _N
  drop comp_treat stratum AB_match IE_match
  save "$cleaned/finance/mar19_village_IE_treats.dta", replace
  restore

/*----------------------------------------------------*/
        /* Section 3: Clean August 2018 Data */
/*----------------------------------------------------*/

*** Load Aug 18 bank agent level dataset
  u "$agent_data/2018/Aug 2018/dataset/all_agent_aug2018.dta", clear
  di _N
  gen KABU = substr(kodewilayah, 1, 4)

  //tab  BANK
  foreach bank in BANK1 BANK2 BANK3 {
    egen num_kabs_`bank' = group(KABU) if BANK == "`bank'"
    //summ num_kabs_`bank'
  }

  di _N
  //tab  BANK
  egen villages = group(kodewilayah)
  //summ villages

* Create BPNT agent dummy variable
  //tab  BANK BPNT1
  // BPNT1 variable indicates BPNT status of BANK2 and BANK3 agents
  // According to Poppy, all BANK1 agents in this dataset are BPNT agents
  gen bpnt_agent = BPNT1
  replace bpnt_agent = 1 if BANK == "BANK1"
  //tab  BANK
  //tab  BANK bpnt_agent
  gen agent = 1
  di _N

* Collapse to village level
  collapse (sum) num_agents_bpnt = bpnt_agent agents_total = agent, by(kodewilayah)
  local num = _N
  di "There are `num' villages in the Aug 18 agent-level dataset"
  //summ num_agents_bpnt agents_total

  tempfile agents_by_village
  save `agents_by_village'


*** Merge into village level data set
  u "$agent_data/Supplementary data/kpm2017_clean.dta", clear
  merge 1:1 kodewilayah using `agents_by_village'
  // 1,011 not matched in villages dataset; 3,822 unmatched in agent dataset

  ** Assume that villages not in the agent-level dataset are 0s for agent variables
  replace num_agents_bpnt = 0 if _m == 1
  replace agents_total = 0 if _m == 1
  drop _m
  //summ num_agents_bpnt agents_total
  rename TotalPenerima beneficiaries
  destring beneficiaries, replace
  rename idkabu idkab

  *** Merge in bank assignments
  preserve
  u "$agent_data/Supplementary data/20200214_groupAB_district_corrected.dta", clear
  keep idkab BANK4
  tempfile bank
  save `bank'
  restore

  merge m:1 idkab using `bank'
  assert _m != 2
  // keep if _m == 3
  drop _m

  ** Merge in strata
  * Create kota dummy to facilitate merge
  gen kota1 = substr(Kode_Kabupaten, 1, 1) == "7"

  // merge in strata
  merge m:1 Provinsi Kabupaten kota1 using `strata', gen(AB_match)
  assert AB_match != 2
  bysort Provinsi Kabupaten (idkab): assert idkab[1] == idkab[_N]

** Merge in impact eval treatments
  // Load (real) treatment and control status dataset
  preserve
  u "$impact_eval/phases_T_n.dta", clear
  keep namaprovinsi namakabupaten treated phase*

  tempfile phases_treats
  save `phases_treats'

  u "$impact_eval/phases_C_n.dta", clear
  keep namaprovinsi namakabupaten treated phase*

  append using `phases_treats'
  rename namakabupaten Kabupaten
  rename namaprovinsi Provinsi

  keep if treated != .
  // Need to create kota/kab variable to distinguish bewteen districts with same name in same province
  /* I identify which districts are experimental using ⁨05_Intervention⁩/01 Impact Evaluation⁩/20190523_Kab Kota Phase Expansion.xlsx,
  		which distinguishes between kota and kab for same-name districts. If a district is experimental in the above sheet, then I
  		assume that it corresponds to the district with the same name in the randomization file. I then assign kab status to these
  		districts to match the phase sheet. This distinguishes between districts in the same province with the same name. */
  gen kab = 1 if inlist(Kabupaten, "BIMA", "GORONTALO", "PEKALONGAN", "SERANG")
  // rename Kabupaten Pontianak to Mempawah (name change in 2014)
  // Note: Kota Pontianak is A/B district, kabupaten Mempawah/Pontianak is IE district
  replace Kabupaten = "MEMPAWAH" if Kabupaten == "PONTIANAK"
  tempfile treatments
  save `treatments'
  restore

  // abbreviate province names to match treatment assignment file
  replace Provinsi = "KEP. BANGKA BELITUNG" if Provinsi == "KEPULAUAN BANGKA BELITUNG"
  replace Provinsi = "D I YOGYAKARTA" if Provinsi == "DI YOGYAKARTA"
  replace Provinsi = "NTB" if Provinsi == "NUSA TENGGARA BARAT"

  // create kab variable to match
  gen kab = 1 if inlist(idkab, "5206", "7502", "3326", "3604")

* merge
  merge m:1 Provinsi Kabupaten kab using `treatments', gen(IE_match)
  assert IE_match != 2

  // keep observations matched from either the impact eval or A/B districts
  count if AB_match == 3 | IE_match == 3
  keep if AB_match == 3 | IE_match == 3

  * merge in impact eval strata because these are not in the phase randomization files
  preserve
  u "$impact_eval/treatments.dta", clear
  rename namaprovinsi Provinsi
  keep if treated != .
  rename namakabupaten Kabupaten
  keep Provinsi Kabupaten finalstratum
  gen kab = 1 if inlist(Kabupaten, "BIMA", "GORONTALO", "PEKALONGAN", "SERANG")
  // rename Kabupaten Pontianak to Mempawah (name change in 2014)
  // Note: Kota Pontianak is A/B district, kabupaten Mempawah/Pontianak is IE district
  replace Kabupaten = "MEMPAWAH" if Kabupaten == "PONTIANAK"
  tempfile IE_strata
  save `IE_strata'
  restore

  * merge
  merge m:1 Provinsi Kabupaten kab using `IE_strata'
  assert _m != 2
  drop _m

  * treated in September variable
  //tab  IE_match
  //tab  treated
  //tab  phase1
  gen treated_sep18 = treated == 1 & phase1 == 1 if treated != .
  //tab  treated_sep18
  //summ treated treated_sep18
  //tab  finalstratum

** Merge in treatment phase data for A/B districts
  * Prep phase data
  preserve
  import excel using "$dropboxbase/05_Intervention/01 Impact Evaluation/20190523_Kab Kota Phase Expansion.xlsx", sheet("Treatment status") firstrow clear
  rename DistrictID idkab
  drop if idkab == ""
  drop if Group == ""
  keep idkab Phase MonthreceiveBPNT
  tempfile district_data
  save `district_data'
  restore

  * Merge
  merge m:1 idkab using `district_data'
  assert _m != 2
  drop _m

  //tab  MonthreceiveBPNT
  gen bpnt_by_aug = inlist(MonthreceiveBPNT, "April", "January", "July", "May")
  //tab  bpnt_by_aug MonthreceiveBPNT

*** Create outcome variables
  * Agents per beneficiary
  count if beneficiaries == .
  gen agent_ratio_bpnt = num_agents_bpnt / beneficiaries
  //summ agent_ratio_bpnt

  * agents per beneficiary (total)
  //summ agents_total num_agents_bpnt
  gen agent_ratio_total = agents_total / beneficiaries
  //summ agent_ratio_total

  * Village has at least 1 BPNT agent for every 250 beneficiaries
  gen ratio250_bpnt = (agent_ratio_bpnt) >= (1 / 250) if !missing(agent_ratio_bpnt)
  //tab  ratio250_bpnt

  * Village has at least 1 BPNT agent for every 250 beneficiaries
  gen ratio250_total = (agent_ratio_total) >= (1 / 250) if !missing(agent_ratio_total)
  //tab  ratio250_total

  * Dummy for village having at least 2 agents
  gen at_least_2_bpnt = num_agents_bpnt >= 2 if !missing(num_agents_bpnt)
  //tab  at_least_2_bpnt

  * village has at least 2 agents (total)
  gen at_least_2_total = agents_total >= 2 if !missing(agents_total)
  //tab  at_least_2_total

  * Village fulfills 1:250 ratio (bpnt agents) and has at least 2 bpnt agents
  gen both_criteria_bpnt = ratio250_bpnt == 1 & at_least_2_bpnt == 1 if !missing(ratio250_bpnt) & !missing(at_least_2_bpnt)
  //tab  both_criteria_bpnt
  //tab  ratio250_bpnt at_least_2_bpnt

  * Village fulfills 1:250 ratio (total agents) and has at least 2 agents total
  gen both_criteria_total = ratio250_total == 1 & at_least_2_total == 1 if !missing(ratio250_total) & !missing(at_least_2_total)
  //tab  both_criteria_total

  * Dummy for in group B
  //tab  comp_treat

  * dummy for group a
  gen groupA = comp_treat == 0
  //tab  comp_treat groupA

** Merge in Mar 18 variables
  merge 1:1 kodewilayah using `agents_village_mar18'
  drop if _m == 2
  //tab  _m IE_match
  //tab  _m AB_match
  //tab  _m

  // assume village has 0 agents in Mar 18 if not in Mar 18 data
  foreach var of varlist agents_BANK1 agents_BANK2 agents_BANK3 num_agents_mar18 {
    replace `var' = 0 if _m == 1
    //summ `var'
  }
  drop _m
  rename agents_BANK1       agents_BANK1_mar18
  rename agents_BANK2       agents_BANK2_mar18
  rename agents_BANK3   agents_BANK3_mar18


* Create Mar 18 variables
  * Agent ratio in March 2018
  gen agent_ratio_mar18 = num_agents_mar18 / beneficiaries
  //summ agent_ratio_bpnt agent_ratio_mar18

  * Meet 1:250 ratio in March 2018
  gen ratio250_mar18 = agent_ratio_mar18 >= (1/250) if !missing(agent_ratio_mar18)
  //summ ratio250_bpnt*

  * Meet at least 2 agents criteria
  gen at_least_2_mar18 = num_agents_mar18 >= 2
  //summ at_least_2_bpnt*

  * Meet both criteria
  gen both_criteria_mar18 = ratio250_mar18 == 1 & at_least_2_mar18 == 1
  //summ both_criteria_bpnt*

  * Variables for first stage analysis
  // banks
  //tab  BANK4
  gen BANK3 = BANK4 == "BANK3"
  gen BANK1 = BANK4 == "BANK1" | BANK4 == "REDACTED"
  gen BANK2 = BANK4 == "BANK2"

  // agents from the assigned bank
  //summ agents*mar18
  gen agents_assigned_bank = agents_BANK1_mar18 if BANK4 == "BANK1" | BANK4 == "REDACTED"
  replace agents_assigned_bank = agents_BANK2_mar18 if BANK4 == "BANK2"
  replace agents_assigned_bank = agents_BANK3_mar18 if BANK4 == "BANK3"
  //summ agents_assigned_bank

* Create variable for number of baseline agents from other banks
  gen agents_other_banks = num_agents_mar18 - agents_assigned_bank
  //summ agents_other_banks

  keep kodewilayah Kode_Provinsi Provinsi Kode_Kabupaten Kabupaten Kode_Kecamatan Kecamatan Kode_Desa NamaDesaKel ///
    idkab beneficiaries agent_ratio* both_criteria* ratio250* at_least_2* comp_treat agents_total bpnt_by_aug ///
    num_agents_bpnt *mar18 groupA stratum BANK4 agents_assigned_bank agents_other_banks BANK3 BANK1 BANK2 treated_sep18 AB_match IE_match finalstratum

  //summ num_agents_bpnt if num_agents_mar18 > 0
  count if num_agents_bpnt == 0 & num_agents_mar18 > 0
  count if agents_total == 0 & num_agents_mar18 > 0

** Check outliers
  //summ num_agents_bpnt , d
  * Create winsorized number of bpnt agents variable
  egen num_agents_bpnt_995 = pctile(num_agents_bpnt), p(99.5)
  //summ num_agents_bpnt_995
  gen num_agents_winsor_bpnt = num_agents_bpnt
  replace num_agents_winsor_bpnt = num_agents_bpnt_995 if num_agents_bpnt > num_agents_bpnt_995 & num_agents_bpnt != .
  //summ num_agents_winsor_bpnt num_agents_bpnt

  //tab  Kode_Kabupaten if num_agents_bpnt >= num_agents_bpnt_995 & num_agents_bpnt != .
  //tab  Kabupaten if num_agents_bpnt >= num_agents_bpnt_995 & num_agents_bpnt != .
// NOTE: of villages with agent counts above 99.5th percentile, 170/191 are in kotas (district code starts with 7)

* Create winsorized agent ratio
  //summ agent_ratio_bpnt, d
  //tab  agent_ratio_bpnt if agent_ratio_bpnt > .6
// %browse beneficiaries agents_total num_agents_bpnt agent_ratio_bpnt if agent_ratio_bpnt > .6 & !missing(agent_ratio_bpnt)

  egen agent_ratio_bpnt_995 = pctile(agent_ratio_bpnt), p(99.5)
  //summ agent_ratio_bpnt_995
  gen agent_ratio_winsor_bpnt = agent_ratio_bpnt
  replace agent_ratio_winsor_bpnt = agent_ratio_bpnt_995 if agent_ratio_bpnt > agent_ratio_bpnt_995 & agent_ratio_bpnt != .
  //summ agent_ratio_bpnt agent_ratio_winsor_bpnt

  rename *winsor* *w*
  rename agents_total num_agents_total

  di _N
  la var groupA                   "Group A"
  la var num_agents_total         "\# Agents In Village"
  la var num_agents_mar18         "\# Agents In Village (Baseline)"
  la var num_agents_bpnt          "\# BPNT Agents In Village"
  la var num_agents_w_bpnt        "\# BPNT Agents In Village (Winsorized)"
  la var agent_ratio_bpnt         "Agents Per Beneficiary"
  la var agent_ratio_w_bpnt       "Agents Per Beneficiary (Winsorized)"
  la var agent_ratio_mar18        "Agents Per Beneficiary (Baseline)"
  la var both_criteria_bpnt       "Both Criteria"
  la var both_criteria_mar18      "Both Criteria (Baseline)"
  la var ratio250_bpnt            "Meet 1/250 Rule"
  la var ratio250_mar18           "Meet 1/250 Rule (Baseline)"
  la var at_least_2_bpnt          "Meet 2 Per Village Rule"
  la var at_least_2_mar18         "Meet 2 Per Village Rule (Baseline)"
  la var comp_treat               "Group B"
  la var beneficiaries            "\# Beneficiaries in Village"
  la var agents_assigned_bank     "Baseline Agents From Chosen Bank"
  la var agents_other_banks       "Baseline Agents From Other Banks"
  la var BANK3                  "BANK3 Chosen"
  la var BANK1                      "BANK1 Chosen"
  la var BANK2                      "BANK2 Chosen"
  quietly ds
  di "`r(varlist)'"

  //summ *total
  //summ *bpnt

* Save A/B dataset
  preserve
  keep if AB_match == 3
  di _N
  drop treated_sep18 IE_match AB_match finalstratum
  save "$cleaned/aug18_village_treats.dta", replace
  restore

* Save impact eval dataset
  preserve
  //tab  IE_match
  keep if IE_match == 3
  //tab  treated_sep18
  drop comp_treat stratum IE_match AB_match
  save "$cleaned/finance/aug18_village_IE_treats.dta", replace
  restore


/*----------------------------------------------------*/
              /* Section: August 2019 */
/*----------------------------------------------------*/
  u "$agent_data/2019/Sep 2019/agent_BANK1.dta", clear
  di _N
  duplicates report NoRegAgen
  duplicates drop NoRegAgen, force
  //tab  JenisBantuanPKHBPNT
  count if Kodewilayahgabungan == ""
  gen KABU = substr(Kodewilayahgabungan, 1, 4)
  di `r(r)'


  u "$agent_data/2019/Sep 2019/agent_BANK3.dta", clear
  di _N
  duplicates report NoRegAgen
  duplicates drop NoRegAgen, force
  //tab  JenisBantuanPKHBPNT
  gen KABU = substr(Kodewilayahgabungan, 1, 4)
  //tab  KABU
  di `r(r)'



/*----------------------------------------------------*/
               /* Section: August 2020 */
/*----------------------------------------------------*/

* load data for each bank and save
  local files : dir "$agent_data/2020/Maret_Agustus/" files "*.xlsx"
  local i = 1
  foreach file in `files' {
    * avoid hidden files in Mac OS
    if substr("`file'", 1, 2) != "._" {
      * import excel "$agent_data/2020/August/`file'", clear sheet("TEMPLATE UNTUK DIISI")
      import excel "$agent_data/2020/Maret_Agustus/`file'", clear sheet("TEMPLATE UNTUK DIISI")
      qui drop in 1/3
      rename (A B C D E F G H I J K L M N O P Q) ///
        (bank kode nama alamat bpnt jenis date edc KodeProvinsi NamaProvinsi KodeKabKota NamaKabupatenKota KodeKec NamaKecamatan KodeDesaKel NamaDesaKel kodewilayah)
      drop R S
      drop if bank == ""

      // fix date
      if bank == "BANK1" | bank == "REDACTED" {
        replace date = substr(date, 1, strlen(date)-5)
    		replace date = ustrtrim(date)
    		gen datetd = date(date, "MDY")
    		format datetd %td
    		//summ datetd , format
      }
      else {
        gen datetd = date(date, "DMY")
        format datetd %td
		    //summ datetd, format
      }

      tempfile file`i'
      qui save `file`i''
      local ++i
    }
  }

* append
  u `file1', clear
  append using `file2'
  append using `file3'
  append using `file4'

* clean
  di _N
  count if kodewilayah == ""
  di "`r(N)' agents with no area code"
  // drop if kodewilayah == ""
  count if datetd == .
  count if kode == ""
  drop if !inlist(bank, "BANK3", "BANK1", "BANK2", "REDACTED")
  //tab  bank

* check Mar 2018
  count if datetd <= td(30mar2018)
  di _N
  //summ datetd, format detail
  replace datetd = td(26mar2017) if datetd == td(26mar2107)
  replace datetd = td(24mar2017) if datetd == td(24mar2107)
  hist datetd if datetd > td(01jan2014)
  // bysort bank: summ datetd, detail format

* save
  drop nama alamat
  save "$cleaned/data_agen_all_aug2020.dta", replace


/*----------------------------------------------------*/
                /* August 2020 Checks */
/*----------------------------------------------------*/

  u "$cleaned/data_agen_all_aug2020.dta", clear
  duplicates drop bank kode, force
  //tab  bank bpnt
  gen KABU = substr(kodewilayah, 1, 4)
  foreach bank in BANK1 BANK2 REDACTED BANK3 {
    qui egen num_kabs_`bank' = group(KABU) if bank == "`bank'"
    //summ num_kabs_`bank'
  }

* merge in March 2018 to cross-reference
  preserve

*** Load Bank Agent data bank by bank
** Load Mar 18 BANK1 data
  u "$agents_mar_18/baseline_BANK1_agent_march2018.dta", clear
  di _N
  isid KODEAGEN
  rename KODEAGEN kode
  rename BANK bank
  keep kode bank kodewilayah
  tempfile BANK1
  save `BANK1'

** Load Mar 18 BANK2 data
  u "$agents_mar_18/baseline_BANK2_agent_march2018.dta", clear
  //tab  BANK
  isid TID
  di _N
  rename BANK bank
  rename TID kode
  keep kode bank kodewilayah
  tempfile BANK2
  save `BANK2'

** Load Mar 18 BANK3 data
  u "$agents_mar_18/baseline_BANK3_agent_march2018.dta", clear
  //tab  BANK3
  keep if BANK3 == 1
  di _N
  isid KODEAGEN
  rename KODEAGEN kode
  gen bank = "BANK3"
  keep kode bank kodewilayah
  tempfile BANK3
  save `BANK3'

** Load matched agents that were previously unmatched
  u "$agents_mar_18/matched_original_unmatched_agents_BANK2_BANK3.dta", clear
  rename KODEAGEN kode
  keep kode bank kodewilayah
  tostring kodewilayah, replace
  tempfile matched_agents
  save `matched_agents'

** Append bank datasets together
  u `BANK2', clear
  append using `BANK1'
  append using `BANK3'
  append using `matched_agents'
  di _N
  tempfile mar2018
  save `mar2018'
  //tab  bank
  isid bank kode
  restore

* merge in March 2018 by agent
  duplicates tag kode, gen(tag)
  //tab  tag

  merge m:1 kode using `mar2018'


  la def merge2018 1 "August 2020 only" 2 "March 2018 only" 3 "Matched"
  la val _m merge2018
  rename _m merge2018
  la var merge2018 "Aug 2020-Mar 2018 Merge"
  //tab  merge2018
  count if merge2018 == 2
  local march_unmatched = `r(N)'
  count if merge2018 == 3
  local matched = `r(N)'
  local matched_rate = (`matched' / (`matched' + `march_unmatched')) * 100
  di `matched_rate'
  //tab  merge2018 bank



* merge with PODES
  u "$cleaned/data_agen_all_aug2020.dta", clear
  destring KodeProvinsi, gen(R101)
  destring KodeKabKota, gen(R102)
  replace KodeKec = "" if missing(real(KodeKec))
  replace KodeDesaKel = "" if missing(real(KodeDesaKel))
  destring KodeKec, gen(R103)
  destring KodeDesaKel, gen(R104)

  merge m:1 R101 R102 R103 R104 using "$cleaned/podes_2018.dta", gen(podes_merge)
  /*
  Result                           # of obs.
  -----------------------------------------
  not matched                        71,770
      from master                    21,119  (podes_merge==1)
      from using                     50,651  (podes_merge==2)

  matched                            91,474  (podes_merge==3)
  -----------------------------------------
  */

  preserve
  keep if podes_merge == 1
  duplicates drop R101 R102 R103 R104, force
  di _N
  // 9245 vilages unmatced from Aug 2020 agent data
  restore

** check merge between 2018 PODES and 2019 PODES
  u "$cleaned/podes_2018.dta", clear
  foreach var of varlist R101 R102 R103 R104 {
    tostring `var', gen(`var'S)
  }

  replace R102S = "0" + R102S if strlen(R102S) == 1
  assert strlen(R102S) == 2

  replace R103S = "0" + R103S if strlen(R103S) == 2
  assert strlen(R103S) == 3

  replace R104S = "00" + R104S if strlen(R104S) == 1
  replace R104S = "0" + R104S if strlen(R104S) == 2
  assert strlen(R104S) == 3

  gen value_desa = R101S + R102S + R103S + R104S
  assert strlen(value_desa) == 10
  destring value_desa, replace
  keep R101 R102 R103 R104 value_desa

  * load 2019 area codes
  preserve
  import delimited "$data/07-09_Questionnaires&Data/Podes 2019/master_wilayah_pendataan_potensi_desa_2019__desa__.csv", clear varnames(1)
  di _N
  tempfile podes2019
  save `podes2019'
  restore

  * merge
  merge 1:1 value_desa using `podes2019'

  /*
  Result                           # of obs.
  -----------------------------------------
  not matched                           504
      from master                       252  (_merge==1)
      from using                        252  (_merge==2)

  matched                            83,685  (_merge==3)
  -----------------------------------------
  */
  sort value_desa
  // %browse value_desa _m if _m != 3



*** Check merge between PODEes 2014 and PODES 2019/2018
u "$data/07-09_Questionnaires&Data/Podes 2014/podes_desa_2014_clean.dta", clear
di _N
rename unique_id value_desa

tempfile podes2014
save `podes2014'

u `podes2019', clear
tostring value_desa, replace
merge 1:1 value_desa using `podes2014'
/*

    REDACTED

*/


** match between March 2019 and August 2020
* append March 2019 uncleaned datasets (to get kodeagen)
u "$agent_data/2019/Mar 2019/dataset/agen_BANK2_mar2019_noPII.dta", clear
gen kode = substr(kodeagen, 21, 8)
drop kodeagen
rename kode kodeagen
destring kodeprovinsi, replace
tostring bulan, replace

append using "$agent_data/2019/Mar 2019/dataset/agen_BANK1_mar2019_noPII.dta"
destring jenisagen, replace
tostring kodeprovinsi, replace

preserve
u "$agent_data/2019/Mar 2019/dataset/agen_BANK3_mar2019_noPII.dta", clear
tostring kodeagen, replace
tostring jenisagen, replace
tempfile BANK3
save `BANK3'
restore
append using  `BANK3'
//tab  bank
count if kodeagen == ""
drop if kodeagen == ""
duplicates drop kodeagen kodewilayah, force
duplicates report kodeagen
duplicates drop kodeagen, force
//tab  bpnt
di _N
// duplicates tag kodeagen, gen(tag)
// di _N
// order kodeagen, first
// gsort -kodeagen
//
// %browse if tag != 0

tempfile agents_mar19
save `agents_mar19'

// %browse 3 kodeagen if bank == "BANK2"
// %browse 3 kodeagen if bank == "BANK1"
// %browse 3 kodeagen if bank == "BANK3"

* overall merge
u "$cleaned/data_agen_all_aug2020.dta", clear
di _N
//tab  bpnt
destring bpnt, replace
rename kode kodeagen
duplicates tag kodeagen, gen(tag)
sort kodeagen
//tab  tag
duplicates drop kodeagen, force
// %browse 281  if tag != 0

merge 1:1 kodeagen using `agents_mar19', force
//tab  bpnt
di _N

/*
Result                           # of obs.
-----------------------------------------
not matched                       164,795
    from master                    65,043  (_merge==1)
    from using                     99,752  (_merge==2)

matched                            47,289  (_merge==3)
-----------------------------------------
*/
di (2 * 47289) / (2*47289 + 164795)

//tab   bpnt if _m == 1
la def m 1 "Aug 2020 only" 2 "Mar 2019 only" 3 "Matched", replace
la val _m m
//tab  _m bpnt , col
//tab  _m bank , col





  /*--------------------------------------------------------*/
      /* Section : Construct Aug 20 as alternative agent set */
  /*--------------------------------------------------------*/

u "$cleaned/data_agen_all_aug2020.dta", clear

gen agent20 = 1

destring KodeProvinsi, gen(R101)
destring KodeKabKota, gen(R102)
replace KodeKec = "" if missing(real(KodeKec))
replace KodeDesaKel = "" if missing(real(KodeDesaKel))
destring KodeKec, gen(R103)
destring KodeDesaKel, gen(R104)
replace kodewilayah = "" if missing(real(kodewilayah))


foreach var of varlist R101 R102 R103 R104 {
  tostring `var', replace
  replace `var' = "0" + `var' if strlen(`var') == 2
  replace `var' = "00" + `var' if strlen(`var') == 1
  assert strlen(`var') == 3
}

gen kodewilayah2 = R101 + R102 + R103 + R104

replace kodewilayah = kodewilayah2 if kodewilayah == ""

collapse (sum) num_agents20 = agent, by(kodewilayah)

tempfile mar20_agents
save `mar20_agents'

* merge into village level dataset
u "$agent_data/Supplementary data/kpm2017_clean.dta", clear
merge 1:1 kodewilayah using `mar20_agents'

/*
REDACTED
*/
drop if _merge == 2

// keep all for now and drop outside of A/B villages later

* make agents in village 0 if not in agent dataset
replace num_agents20 = 0  if num_agents20 == . & _merge  == 1
drop _merge
// duplicates drop R101 R102 R103 R104, force

save "$cleaned/data_agen_aug2020.dta", replace



cap log close
